{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Titanic Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "While the purpose of this exercise right now is for a non python based blog, eventually this will be a tutorial in itself I imagine. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn import tree \n",
    "import numpy as np\n",
    "\n",
    "url = \"https://raw.githubusercontent.com/mGalarnyk/Python_Tutorials/master/Kaggle/Titanic/train.csv\"\n",
    "# load dataset into Pandas DataFrame\n",
    "df = pd.read_csv(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Change sex to binary\n",
    "df['Sex'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Take subset of the dataset\n",
    "\n",
    "df = df[['Sex', 'Age', 'Fare', 'Survived']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    577\n",
       "0    314\n",
       "Name: Sex, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Sex.value_counts(dropna = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8.0500      43\n",
       "13.0000     42\n",
       "7.8958      38\n",
       "7.7500      34\n",
       "26.0000     31\n",
       "10.5000     24\n",
       "7.9250      18\n",
       "7.7750      16\n",
       "26.5500     15\n",
       "0.0000      15\n",
       "7.2292      15\n",
       "7.8542      13\n",
       "8.6625      13\n",
       "7.2500      13\n",
       "7.2250      12\n",
       "16.1000      9\n",
       "9.5000       9\n",
       "24.1500      8\n",
       "15.5000      8\n",
       "56.4958      7\n",
       "52.0000      7\n",
       "14.5000      7\n",
       "14.4542      7\n",
       "69.5500      7\n",
       "7.0500       7\n",
       "31.2750      7\n",
       "46.9000      6\n",
       "30.0000      6\n",
       "7.7958       6\n",
       "39.6875      6\n",
       "            ..\n",
       "7.1417       1\n",
       "42.4000      1\n",
       "211.5000     1\n",
       "12.2750      1\n",
       "61.1750      1\n",
       "8.4333       1\n",
       "51.4792      1\n",
       "7.8875       1\n",
       "8.6833       1\n",
       "7.5208       1\n",
       "34.6542      1\n",
       "28.7125      1\n",
       "25.5875      1\n",
       "7.7292       1\n",
       "12.2875      1\n",
       "8.6542       1\n",
       "8.7125       1\n",
       "61.3792      1\n",
       "6.9500       1\n",
       "9.8417       1\n",
       "8.3000       1\n",
       "13.7917      1\n",
       "9.4750       1\n",
       "13.4167      1\n",
       "26.3875      1\n",
       "8.4583       1\n",
       "9.8375       1\n",
       "8.3625       1\n",
       "14.1083      1\n",
       "17.4000      1\n",
       "Name: Fare, Length: 248, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Fare.value_counts(dropna = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8.0500      43\n",
       "13.0000     42\n",
       "7.8958      38\n",
       "7.7500      34\n",
       "26.0000     31\n",
       "10.5000     24\n",
       "7.9250      18\n",
       "7.7750      16\n",
       "26.5500     15\n",
       "0.0000      15\n",
       "7.2292      15\n",
       "7.8542      13\n",
       "8.6625      13\n",
       "7.2500      13\n",
       "7.2250      12\n",
       "16.1000      9\n",
       "9.5000       9\n",
       "24.1500      8\n",
       "15.5000      8\n",
       "56.4958      7\n",
       "52.0000      7\n",
       "14.5000      7\n",
       "14.4542      7\n",
       "69.5500      7\n",
       "7.0500       7\n",
       "31.2750      7\n",
       "46.9000      6\n",
       "30.0000      6\n",
       "7.7958       6\n",
       "39.6875      6\n",
       "            ..\n",
       "7.1417       1\n",
       "42.4000      1\n",
       "211.5000     1\n",
       "12.2750      1\n",
       "61.1750      1\n",
       "8.4333       1\n",
       "51.4792      1\n",
       "7.8875       1\n",
       "8.6833       1\n",
       "7.5208       1\n",
       "34.6542      1\n",
       "28.7125      1\n",
       "25.5875      1\n",
       "7.7292       1\n",
       "12.2875      1\n",
       "8.6542       1\n",
       "8.7125       1\n",
       "61.3792      1\n",
       "6.9500       1\n",
       "9.8417       1\n",
       "8.3000       1\n",
       "13.7917      1\n",
       "9.4750       1\n",
       "13.4167      1\n",
       "26.3875      1\n",
       "8.4583       1\n",
       "9.8375       1\n",
       "8.3625       1\n",
       "14.1083      1\n",
       "17.4000      1\n",
       "Name: Fare, Length: 248, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Fare.value_counts(dropna = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Impute age with mean \n",
    "df.loc[df.Age.isna(), 'Age'] = np.ceil(df.Age.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf = tree.DecisionTreeClassifier(max_depth=2) \n",
    "clf = clf.fit(df[['Sex', 'Age']], df[['Survived']]) \n",
    "tree.export_graphviz(clf,\n",
    "                     out_file=\"decisionTreeTitantic.dot\",\n",
    "                     feature_names=['Sex', 'Age'],\n",
    "                     class_names=['Dead', 'Alive'],\n",
    "                     proportion = True,\n",
    "                     rotate = True, \n",
    "                     filled = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!dot -Tpng decisionTreeTitantic.dot -o decisionTreeTitantic.png"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['setosa', 'versicolor', 'virginica'], dtype='|S10')"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.target_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
